On my honor, as a student, I have neither given nor received unauthorized aid on this academic work. The assignment I am submitting is my own containing my own codes and words.
# import packages
import requests as rq
import pandas as pd
import numpy as np
import random as rd
from lxml import html
import csv
# for plotting
import matplotlib as plt
%matplotlib inline
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.graph_objs import *
init_notebook_mode()
import seaborn as sns
# Import csv file into dataframe (df) for data cleaning
# data from NYC Open Data (https://data.cityofnewyork.us/Housing-Development/DOB-Job-Application-Filings/ic3t-wcy2)
df = pd.read_csv('data/dob_job_application_filings.csv', sep=',', low_memory=False)
# copy dataset
df2 = df.copy()
# To see and display top 5 rows
df2.head()
# See the end (tail) of the data, or last 5 rows
df2.tail()
# See the shape of the data.
df2.shape
# See the columns
df2.columns
# Find out data types
df2.info()
# Find frequencies of specific landmarked column data using dot notation. Because of the amont of data, we can shorten
# our search to the first 5 rows by adding .head() at the end. This is a way to slice multiple methods into one statement.
# df['Zip'].value_counts(dropna=False) would be the bracket notation and provide the same results.
df2.Landmarked.value_counts(dropna=False).head()
# we can also search the last 5 rows by using tail()
df2.Landmarked.value_counts(dropna=False).tail()
# If a column name has a space, it is best to use the bracket notation.
df2['Fully Paid'].value_counts(dropna=False).head()
# General summary of data
df2.describe()
# View data in PC Filed column to find missing values
df2['PC Filed'].describe()
# using df3 variable so as to continue work with df2 dataset
df3 = df2.copy()
df3 = df3.drop(['PC Filed'], axis=1)
df3.head()
# see info of last 5 rows.
df3.tail()
# using ffill() function to forward fill data with zero
df4 = df2.copy()
df4 = df4.ffill(0)
df4.head()
# using bfill() function to back fill data with the value of 1
df5 = df2.copy()
df5 = df5.bfill(1)
df5.head()
# display the last 5 rows
df5.tail()
# copy before testing
df6 = df2.copy()
# replace null value with the median value
df6 = df6.fillna({'Age': 28})
df6.isnull().sum()
# replace null value with the most popular value in a categorial columns
df6 = df6.fillna({'Embarked': 'S'})
df6.isnull().sum()
# hello in Owner's Business name is inaccurate ...replacing inaccurate value with null value
df6 = df6.replace('None', np.nan)
df6.head()
# 350 is an inaccurate value for age and should be replaced by 35
# changinge single value with replace
df6 = df6.replace({"Owner's Business Name": 'NA'}, {"Owner's Business Name": 'No Name Given'})
df6.head()
# pandas object
df7 = df2.copy()
# for one column
df7['Pre- Filing Date'] = pd.Timestamp('20130425')
df7.tail()
# for multiple columns
df7['Paid'] = pd.Timestamp('20130425')
df7['Fully Paid'] = pd.Timestamp('20130425')
df7['Assigned'] = pd.Timestamp('20130425')
df7['Approved'] = pd.Timestamp('20130425')
df7['Fully Permitted'] = pd.Timestamp('20130425')
# drop all columns that contain missing or null data.
df8 = df2.copy()
df8 = df8.dropna(axis=1)
df8.head()
# last 5 rows
df8.tail()
# data info
df8.info()
df9 = pd.DataFrame(df2)
df9['x'] = df9['Existing Zoning Sqft']
df9['y'] = df9['Proposed Zoning Sqft']
# Scatter plot
sns.lmplot('x', 'y', data=df9, fit_reg=False)
# Box plot
sns.boxplot([df9.x, df9.y])
# Density plot
sns.kdeplot(df9['x'])
# iplot "scatterplot" for visualization.
dt = [Scatter(x=df9.x, y=df9.y, mode = 'markers')]
iplot(dt)
# create a pivot, which helps you develop insights about the data in question.
df10 = df8.copy()
df10.groupby('Job #').count()
# save back to csv file (this example is only partially clean; you want to make sure you have used all code to clean the data first)
df10.to_csv("data/cleaned_data.csv", encoding="utf-8")